{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!/usr/bin/env python\n",
    "# -*- coding: utf-8 -*-\n",
    "\n",
    "import sys\n",
    "sys.path.append('../')\n",
    "\n",
    "import argparse\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "from importlib import reload  \n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn import svm\n",
    "from sklearn.utils import shuffle\n",
    "\n",
    "from loglizer.models import InvariantsMiner, PCA, IsolationForest, OneClassSVM, LogClustering, LR, SVM\n",
    "from loglizer import dataloader, preprocessing\n",
    "from loglizer.utils import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "ouput_dir = \"../output/bgl/\"\n",
    "middle_dir = \"\"\n",
    "log_file = \"BGL.log\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<!-- # Produce event templates from train test dataset -->"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split train test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/haixuanguo/Documents/deeplog_copy_github/BGL/../loglizer/dataloader.py:286: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
      "  train = np.array(train).reshape(-1,1)\n",
      "/home/haixuanguo/Documents/deeplog_copy_github/BGL/../loglizer/dataloader.py:292: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
      "  test_normal = np.array(test_normal).reshape(-1,1)\n",
      "/home/haixuanguo/Documents/deeplog_copy_github/BGL/../loglizer/dataloader.py:298: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
      "  abnormal = np.array(abnormal).reshape(-1,1)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train normal size: 13718\n",
      "Train abnormal size: 1207\n",
      "Total logkey(exclude 0:UNK) 1000\n",
      "Test normal size: 20579\n",
      "Test abnormal size: 1811\n",
      "num_unk_event in test data: 0\n"
     ]
    }
   ],
   "source": [
    "(x_train, y_train), (x_test, y_test) = dataloader.load_data(ouput_dir, middle_dir, log_file, is_mapping=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "====== Transformed train data summary ======\n",
      "Train data shape: 14925-by-832\n",
      "\n",
      "====== Transformed test data summary ======\n",
      "Test data shape: 22390-by-832\n",
      "\n"
     ]
    }
   ],
   "source": [
    "feature_extractor = preprocessing.FeatureExtractor()\n",
    "x_train = feature_extractor.fit_transform(x_train)\n",
    "x_test = feature_extractor.transform(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================== Model: PCA ====================\n",
      "theshold 0\n",
      "====== Model summary ======\n",
      "n_components: 5\n",
      "Project matrix shape: 832-by-832\n",
      "SPE threshold: 1\n",
      "\n",
      "Train validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 1193, FP: 11915, TN: 1803, FN: 14\n",
      "Precision: 9.101%, recall: 98.840%, F1-measure: 16.668%\n",
      "\n",
      "Test validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 1777, FP: 17824, TN: 2755, FN: 34\n",
      "Precision: 9.066%, recall: 98.123%, F1-measure: 16.598%\n",
      "\n",
      "CPU times: user 16.9 s, sys: 66.9 ms, total: 17 s\n",
      "Wall time: 1.73 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"=\"*20 + \" Model: PCA \" + \"=\"*20)\n",
    "for th in np.arange(1):\n",
    "    print(\"theshold\", th)\n",
    "    model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)\n",
    "    model.fit(x_train)\n",
    "    print('Train validation:')\n",
    "    precision, recall, f1 = model.evaluate(x_train, y_train)\n",
    "    print('Test validation:')\n",
    "    precision, recall, f1 = model.evaluate(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================== Model: IsolationForest ====================\n",
      "====== Model summary ======\n",
      "Train validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 174, FP: 0, TN: 13718, FN: 1033\n",
      "Precision: 100.000, recall: 14.416, F1-measure: 25.199\n",
      "\n",
      "Test validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 258, FP: 0, TN: 20579, FN: 1553\n",
      "Precision: 100.000, recall: 14.246, F1-measure: 24.940\n",
      "\n",
      "CPU times: user 17.2 s, sys: 2.96 s, total: 20.2 s\n",
      "Wall time: 18.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"=\"*20 + \" Model: IsolationForest \" + \"=\"*20)\n",
    "model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)\n",
    "model.fit(x_train)\n",
    "print('Train validation:')\n",
    "precision, recall, f1 = model.evaluate(x_train, y_train)\n",
    "print('Test validation:')\n",
    "precision, recall, f1 = model.evaluate(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================== Model: one class SVM ====================\n",
      "====== Model summary ======\n",
      "Train validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 152, FP: 13718, TN: 0, FN: 1055\n",
      "Precision: 1.096, recall: 12.593, F1-measure: 2.016\n",
      "\n",
      "Test validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 227, FP: 20579, TN: 0, FN: 1584\n",
      "Precision: 1.091, recall: 12.534, F1-measure: 2.007\n",
      "\n",
      "CPU times: user 6min 39s, sys: 69.4 ms, total: 6min 39s\n",
      "Wall time: 6min 39s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"=\"*20 + \" Model: one class SVM \" + \"=\"*20)\n",
    "model = OneClassSVM(kernel='rbf')\n",
    "model.fit(x_train, y_train)\n",
    "\n",
    "print('Train validation:')\n",
    "precision, recall, f1 = model.evaluate(x_train, y_train)\n",
    "print('Test validation:')\n",
    "precision, recall, f1 = model.evaluate(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================== Model: LogClustering ====================\n",
      "====== Model summary ======\n",
      "Starting offline clustering...\n",
      "Processed 1000 instances.\n",
      "Found 92 clusters offline.\n",
      "\n",
      "Starting online clustering...\n",
      "Processed 2000 instances.\n",
      "Processed 4000 instances.\n",
      "Processed 6000 instances.\n",
      "Processed 8000 instances.\n",
      "Processed 10000 instances.\n",
      "Processed 12000 instances.\n",
      "Processed 13718 instances.\n",
      "Found 172 clusters online.\n",
      "\n",
      "Train validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 775, FP: 1, TN: 13717, FN: 432\n",
      "Precision: 99.871, recall: 64.209, F1-measure: 78.164\n",
      "\n",
      "Test validation:\n",
      "====== Evaluation summary ======\n",
      "Confusion Matrix: TP: 1215, FP: 64, TN: 20515, FN: 596\n",
      "Precision: 94.996, recall: 67.090, F1-measure: 78.641\n",
      "\n",
      "CPU times: user 1min 42s, sys: 28.1 ms, total: 1min 42s\n",
      "Wall time: 1min 42s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"=\"*20 + \" Model: LogClustering \" + \"=\"*20)\n",
    "max_dist = 0.3  # the threshold to stop the clustering process\n",
    "anomaly_threshold = 0.3  # the threshold for anomaly detection\n",
    "model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)\n",
    "model.fit(x_train[y_train == 0, :])  # Use only normal samples for training\n",
    "print('Train validation:')\n",
    "precision, recall, f1 = model.evaluate(x_train, y_train)\n",
    "print('Test validation:')\n",
    "precision, recall, f1 = model.evaluate(x_test, y_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
